# Set of climatic variables# =========================clim_var <-readRDS(here("data/ClimaticData/NamesSelectedVariables.rds"))# Climatic data# =============# we scale the past and future climatic data at the location of the populations# with the parameters (mean and variance) of the past climatic data.source(here("scripts/functions/generate_scaled_clim_datasets.R"))clim_dfs <-generate_scaled_clim_datasets(clim_var,clim_ref_adj =FALSE)
We generate some plots to evaluate the GF models (stored in the files GFplots_[SNP set code].pdf):
Predictor overall importance plots. They show the mean accuracy importance and the mean importance weighted by SNPs \(\mathcal{R}^2\).
Splits density plots. They show the binned split importance and location on each gradient (spikes), kernel density of splits (blacklines), of observations (red lines) and of splits standardised by observations density (bluelines). Each distribution integrates to predictor importance. These show where important changes in the abundance of multiple alleles are occurring along the gradient; they indicate a composition change rate.
Species (in our case alleles) cumulative plots. They show, for each SNPs, the cumulative importance distributions of splits improvement scaled by \(\mathcal{R}^2\) weighted importance, and standardised by density of observations. These show cumulative change in the presence of individual allele, where changes occur on the gradient, and the alleles changing most on each gradient.
Predictor cumulative plots. They show, for each predictor, the cumulative importance distributions of splits improvement scaled by \(\mathcal{R}^2\) weighted importance, and standardised by density of observations, averaged over all SNPs. These show cumulative change in overall allelic composition, and where changes occur on the gradient.
\(\mathcal{R}^2\)measure of the fit of the random forest model for each SNPs.
The code to generate those plots comes from ‘Example analysis of biodiversity survey data with R package gradientForest’ by C. Roland Pitcher, Nick Ellis and Stephen J. Smith (pdf available here).
Code
lapply(snp_sets, function(snp_set){pdf(here(paste0("figs/GF/GFplots_",snp_set$set_code,".pdf")), width=10,height=8)# Overall importance plotplot(snp_set$gf_mod, plot.type="Overall.Importance")# splits density plotplot(snp_set$gf_mod,plot.type="S",imp.vars=names(importance(snp_set$gf_mod)),leg.posn="topright",cex.legend=0.6,cex.axis=0.6, cex.lab=0.7,line.ylab=0.9,par.args=list(mgp=c(1.5, 0.5,0),mar=c(3.1,1.5,0.1,1)))# species cumulative plotplot(snp_set$gf_mod,plot.type="C",imp.vars=names(importance(snp_set$gf_mod)), show.overall=F,legend=T,leg.posn="topleft", leg.nspecies=14,cex.lab=0.7,cex.legend=0.6, cex.axis=0.6,line.ylab=0.9,par.args=list(mgp=c(1.5, 0.5,0),mar=c(2.5,1,0.1,0.5),omi=c(0, +0.3,0,0)))# predictor cumulative plotplot(snp_set$gf_mod,plot.type="C",imp.vars=names(importance(snp_set$gf_mod)),show.species=F,common.scale=T,cex.axis=0.6, cex.lab=0.7,line.ylab=0.9,par.args=list(mgp=c(1.5, 0.5,0),mar=c(2.5,1,0.1,0.5),omi=c(0, +0.3,0,0)))# R2 measure of the fit of the random forest model for each speciesplot(snp_set$gf_mod,plot.type="P",show.names=T,horizontal=F, cex.axis=1,cex.labels=0.7,line=2.5)dev.off()})
GO predictions
Code
snp_sets <-lapply(snp_sets, function(snp_set){snp_set$go <-lapply(clim_dfs$clim_pred, function(clim_pred){ref_pred <-predict(snp_set$gf_mod) # predictions under current climatesfut_pred <-predict(snp_set$gf_mod, as.data.frame(clim_pred[,clim_var])) # predictions under future climateslapply(1:nrow(ref_pred), function(x, ref_pred, fut_pred){as.numeric(pdist(ref_pred[x,], fut_pred[x,])@dist)}, fut_pred=fut_pred, ref_pred=ref_pred) %>%unlist()})return(snp_set)})
Relationship with Euclidean distance
Code
source(here("scripts/functions/make_eucli_plot.R"))# Calculate the Euclidean climatic distancelist_dist_env <- clim_dfs$clim_pred %>%lapply(function(clim_pred){Delta = clim_dfs$clim_ref %>% dplyr::select(any_of(clim_var)) - clim_pred %>% dplyr::select(any_of(clim_var)) dist_env =sqrt( rowSums(Delta^2) )})# Main gene pools (for the figures)gps <-readRDS(here("data/GenomicData/MainGenePoolPopulations.rds")) %>%arrange(pop)
# Function to make the genomic offset mapssource(here("scripts/functions/make_go_map.R"))# Population coordinatespop_coord <-readRDS(here(paste0("data/ClimaticData/MaritimePinePops/ClimatePopulationLocationPointEstimates_ReferencePeriods_noADJ.rds")))[[1]]$ref_means %>% dplyr::select(pop,longitude,latitude)# Find minimum and maximum values of genomic offset for the mapsgo_limits <-lapply(snp_sets, function(x) {lapply(names(list_dist_env), function(gcm){x$go[[gcm]]}) %>%unlist()}) %>%unlist() %>%range()# The minimum GO value is very very small, almost zero, so we fix it to zero.go_limits[[1]] <-0# Generate the maps for each set of SNPs and each GCMlapply(snp_sets, function(x) {go_maps <-lapply(names(list_dist_env), function(gcm){make_go_map(dfcoord=pop_coord,snp_set = x,gcm=gcm,ggtitle=gcm,go_limits = go_limits,point_size =3)})legend_maps <-get_legend(go_maps[[1]])go_maps <-lapply(go_maps, function(y) y +theme(legend.position ="none"))go_maps$legend_maps <- legend_mapsgo_maps <-plot_grid(plotlist=go_maps)title <-ggdraw() +draw_label( x$set_name,fontface ='bold',x =0,hjust =0 ) +theme(plot.margin =margin(0, 0, 0, 7))# merge title and plotsplot_grid( title, go_maps,ncol =1,rel_heights =c(0.1, 1))})
Validation - NFI plots
Code
# Load the climatic data of the NFI plots.nfi_clim <-readRDS(here("data/ClimaticData/NFIplots/NFIclimate.rds"))# Keep only the climatic variables of interest and scale the climatic datanfi_dfs <-generate_scaled_clim_datasets(clim_var, clim_ref = nfi_clim$clim_ref, clim_pred = nfi_clim$clim_survey)# calculate the genomic offset for the NFI plotssnp_sets <-lapply(snp_sets, function(snp_set){ref_pred <-predict(snp_set$gf_mod, as.data.frame(nfi_dfs$clim_ref[,clim_var])) # predictions under reference-period climatesfut_pred <-predict(snp_set$gf_mod, as.data.frame(nfi_dfs$clim_pred[,clim_var])) # predictions under climates during survey periodsnp_set$go_nfi <-lapply(1:nrow(ref_pred), function(x, ref_pred, fut_pred){as.numeric(pdist(ref_pred[x,], fut_pred[x,])@dist)}, fut_pred=fut_pred, ref_pred=ref_pred) %>%unlist()return(snp_set)})# checking missing data# lapply(snp_sets, function(x) sum(is.na(x$go_nfi)))# Find minimum and maximum values of genomic offset for the mapsgo_limits <-lapply(snp_sets, function(snp_set) snp_set$go_nfi) %>%unlist() %>%range()# The minimum GO value is very very small, almost zero, so we fix it to zero.go_limits[[1]] <-0# map genomic offset predictions in the NFI plots lapply(snp_sets, function(x) make_go_map(dfcoord=readRDS(here("data/ClimaticData/NFIplots/NFIclimate.rds"))[[1]] %>% dplyr::select(contains("ude")), snp_set = x,type="NFI",point_size =0.5,go_limits = go_limits,legend_position =c(0.85,0.15),legend_box_background ="gray80",y_limits =c(35, 51)))
Validation - Common gardens
Code
cg_clim <-readRDS(here("data/ClimaticData/CommonGardens/ClimateCG.rds")) %>% dplyr::select(cg,any_of(clim_var))cg_coord <-readRDS(here("data/ClimaticData/CommonGardens/ClimateCG.rds")) %>% dplyr::select(cg,contains("ude"))cg_names <-unique(cg_coord$cg)# Generate scaled climatic datasets with climatic data at the location of the populations and at the location of the common gardenscg_dfs <-generate_scaled_clim_datasets(clim_var, clim_pred = cg_clim)# Predict genomic offset of each population when transplanted in the climate of the common gardenssnp_sets <-lapply(snp_sets, function(snp_set){ref_pred <-predict(snp_set$gf_mod, as.data.frame(cg_dfs$clim_ref[,clim_var])) # predictions under reference-period climatesfut_pred <-predict(snp_set$gf_mod, as.data.frame(cg_dfs$clim_pred[,clim_var])) # predictions under climates during survey periodsnp_set$go_cg <-lapply(1:nrow(ref_pred), function(x, ref_pred, fut_pred){as.numeric(pdist(ref_pred[x,], fut_pred)@dist)}, fut_pred=fut_pred, ref_pred=ref_pred) %>%setNames(cg_dfs$clim_ref[["pop"]]) %>%as.data.frame() %>%t() %>%as.data.frame() %>%set_colnames(cg_dfs$clim_pred[["cg"]]) %>%rownames_to_column(var="pop") %>%as_tibble()return(snp_set)})# Map genomic offset predictions at the locations of the populationsgo_maps_cg <-lapply(cg_names, function(cg_name){# Find minimum and maximum values of genomic offset for the mapsgo_limits <-lapply(snp_sets, function(snp_set) snp_set$go_cg[[cg_name]]) %>%unlist() %>%range()go_limits[[1]] <-0p <-lapply(snp_sets, function(x) make_go_map(dfcoord=pop_coord, snp_set = x,point_size =3,type="CG",go_limits = go_limits,cg_name=cg_name,cg_coord=cg_coord))plot_grid(p[[1]],p[[2]],p[[3]],nrow=1)}) %>%setNames(cg_names)pdf(here("figs/GF/GOmaps_CGs.pdf"), width=15,height=6)lapply(go_maps_cg, function(x) x)dev.off()# show mapslapply(go_maps_cg, function(x) x)
Let’s save the genomic offset predictions for comparison with the other methods.
Fitzpatrick, Matthew C, Vikram E Chhatre, Raju Y Soolanayakanahally, and Stephen R Keller. 2021. “Experimental Support for Genomic Prediction of Climate Maladaptation Using the Machine Learning Approach Gradient Forests.”Molecular Ecology Resources.